From 0068fb5745870c50ea428294f7ecd3dcf733eaf7 Mon Sep 17 00:00:00 2001 From: Daniel Sabo Date: Sun, 27 Dec 2015 07:29:55 -0800 Subject: [PATCH] Add support for hardware half<->float conversions These instructions require an Ivy Bridge or newer processor, so I've only been able to test them under the Intel Software Development Emulator. --- babl/babl-cpuaccel.c | 6 +- babl/babl-cpuaccel.h | 3 + configure.ac | 22 ++++ extensions/Makefile.am | 3 + extensions/sse-half.c | 270 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 extensions/sse-half.c diff --git a/babl/babl-cpuaccel.c b/babl/babl-cpuaccel.c index 4e1683e..59fdcdd 100644 --- a/babl/babl-cpuaccel.c +++ b/babl/babl-cpuaccel.c @@ -118,7 +118,8 @@ enum ARCH_X86_INTEL_FEATURE_SSSE3 = 1 << 9, ARCH_X86_INTEL_FEATURE_SSE4_1 = 1 << 19, ARCH_X86_INTEL_FEATURE_SSE4_2 = 1 << 20, - ARCH_X86_INTEL_FEATURE_AVX = 1 << 28 + ARCH_X86_INTEL_FEATURE_AVX = 1 << 28, + ARCH_X86_INTEL_FEATURE_F16C = 1 << 29, }; #if !defined(ARCH_X86_64) && (defined(PIC) || defined(__PIC__)) @@ -244,6 +245,9 @@ arch_accel_intel (void) if (ecx & ARCH_X86_INTEL_FEATURE_SSE4_1) caps |= BABL_CPU_ACCEL_X86_SSE4_1; + + if (ecx & ARCH_X86_INTEL_FEATURE_F16C) + caps |= BABL_CPU_ACCEL_X86_F16C; #endif /* USE_SSE */ } #endif /* USE_MMX */ diff --git a/babl/babl-cpuaccel.h b/babl/babl-cpuaccel.h index 57eb118..8040d73 100644 --- a/babl/babl-cpuaccel.h +++ b/babl/babl-cpuaccel.h @@ -32,6 +32,9 @@ typedef enum BABL_CPU_ACCEL_X86_SSE3 = 0x02000000, BABL_CPU_ACCEL_X86_SSSE3 = 0x00800000, BABL_CPU_ACCEL_X86_SSE4_1 = 0x00400000, + /* BABL_CPU_ACCEL_X86_SSE4_2 = 0x00200000, */ + /* BABL_CPU_ACCEL_X86_AVX = 0x00080000, */ + BABL_CPU_ACCEL_X86_F16C = 0x00040000, /* powerpc accelerations */ BABL_CPU_ACCEL_PPC_ALTIVEC = 0x04000000, diff --git a/configure.ac b/configure.ac index f09c7ac..28e9af0 100644 --- a/configure.ac +++ b/configure.ac @@ -303,6 +303,10 @@ AC_ARG_ENABLE(sse4_1, [ --enable-sse4_1 enable SSE4_1 
support (default=auto)],, enable_sse4_1=$enable_sse) +AC_ARG_ENABLE(f16c, + [ --enable-f16c enable hardware half-float support (default=auto)],, + enable_f16c=$enable_sse) + if test "x$enable_mmx" = xyes; then BABL_DETECT_CFLAGS(MMX_EXTRA_CFLAGS, '-mmmx') SSE_EXTRA_CFLAGS= @@ -378,6 +382,24 @@ if test "x$enable_mmx" = xyes; then fi fi + if test "x$enable_f16c" = xyes; then + BABL_DETECT_CFLAGS(f16c_flag, '-mf16c') + AC_SUBST(F16C_EXTRA_CFLAGS, ["$SSE_EXTRA_CFLAGS $f16c_flag"]) + + AC_MSG_CHECKING(whether we can compile half-floating point code) + + CFLAGS="$CFLAGS $sse_flag $f16c_flag" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],[_mm_cvtph_ps ((__m128i)_mm_setzero_ps());])], + AC_DEFINE(USE_F16C, 1, [Define to 1 if f16c intrinsics are available.]) + AC_MSG_RESULT(yes) + , + enable_f16c=no + AC_MSG_RESULT(no) + AC_MSG_WARN([The compiler does not support f16c intrinsics.]) + ) + fi + fi , enable_mmx=no diff --git a/extensions/Makefile.am b/extensions/Makefile.am index cd7e893..c06aa8f 100644 --- a/extensions/Makefile.am +++ b/extensions/Makefile.am @@ -32,6 +32,7 @@ ext_LTLIBRARIES = \ sse2-int8.la \ sse2-int16.la \ sse4-int8.la \ + sse-half.la \ two-table.la \ ycbcr.la @@ -50,6 +51,7 @@ sse2_float_la_SOURCES = sse2-float.c sse2_int8_la_SOURCES = sse2-int8.c sse2_int16_la_SOURCES = sse2-int16.c sse4_int8_la_SOURCES = sse4-int8.c +sse_half_la_SOURCES = sse-half.c two_table_la_SOURCES = two-table.c two-table-tables.h ycbcr_la_SOURCES = ycbcr.c float_la_SOURCES = float.c @@ -62,3 +64,4 @@ sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS) sse4_int8_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) +sse_half_la_CFLAGS = $(SSE4_1_EXTRA_CFLAGS) $(F16C_EXTRA_CFLAGS) diff --git a/extensions/sse-half.c b/extensions/sse-half.c new file mode 100644 index 0000000..ca57ceb --- /dev/null +++ b/extensions/sse-half.c @@ -0,0 +1,270 @@ +/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2015 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+
+#include <immintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/* Widen `samples` IEEE 754 half floats at src into 32-bit floats at dst
+ * with the F16C convert intrinsic.  Neither buffer needs SIMD alignment.
+ * Always returns `samples`. */
+static inline long
+conv_yHalf_yF (const uint16_t *src, float *dst, long samples)
+{
+  long n = samples;
+
+  /* Main loop: four halves (64 bits) in, four floats (128 bits) out. */
+  while (n >= 4)
+    {
+      /* SSE2 load, fine with unaligned addresses and zeroing the upper
+       * lanes; unlike _mm_insert_epi64 it also builds for 32-bit x86. */
+      __m128i in_val = _mm_loadl_epi64 ((const __m128i *) src);
+      _mm_storeu_ps (dst, _mm_cvtph_ps (in_val));
+      src += 4;
+      dst += 4;
+      n -= 4;
+    }
+
+  /* Tail: convert the remaining 0-3 values one at a time. */
+  while (n)
+    {
+      __m128i in_val = _mm_insert_epi16 (_mm_setzero_si128 (), *src++, 0);
+      _mm_store_ss (dst++, _mm_cvtph_ps (in_val));
+      n -= 1;
+    }
+
+  return samples;
+}
+
+/* Multi-component variants: run the flat converter over
+ * samples * components values and scale the returned count back. */
+static long
+conv_yaHalf_yaF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgbHalf_rgbF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgbaHalf_rgbaF (const uint16_t *src, float *dst, long samples)
+{
+  return conv_yHalf_yF (src, dst, samples * 4) / 4;
+}
+
+/* Narrow `samples` 32-bit floats at src into IEEE 754 half floats at dst
+ * with the F16C convert intrinsic, rounding to nearest with
+ * _MM_FROUND_NO_EXC set.  Always returns `samples`. */
+static inline long
+conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
+{
+  enum { ROUND_MODE = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC };
+  long n = samples;
+
+  /* Main loop: four floats (128 bits) in, four halves (64 bits) out. */
+  while (n >= 4)
+    {
+      __m128i out_val = _mm_cvtps_ph (_mm_loadu_ps (src), ROUND_MODE);
+      _mm_storel_epi64 ((__m128i *) dst, out_val);
+      src += 4;
+      dst += 4;
+      n -= 4;
+    }
+
+  /* Tail: convert the remaining 0-3 values one at a time. */
+  while (n)
+    {
+      __m128i out_val = _mm_cvtps_ph (_mm_load_ss (src++), ROUND_MODE);
+      *dst++ = _mm_extract_epi16 (out_val, 0);
+      n -= 1;
+    }
+
+  return samples;
+}
+
+/* Multi-component variants: flat conversion over samples * components. */
+static long
+conv_yaF_yaHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 2) / 2;
+}
+
+static long
+conv_rgbF_rgbHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 3) / 3;
+}
+
+static long
+conv_rgbaF_rgbaHalf (const float *src, uint16_t *dst, long samples)
+{
+  return conv_yF_yHalf (src, dst, samples * 4) / 4;
+}
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+/* Registers the converters when the CPU reports SSE4.1 + F16C; returns 0. */
+int init (void);
+
+int
+init (void)
+{
+#if defined(USE_SSE4_1) && defined(USE_F16C)
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("half"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("half"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbHalf_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("half"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *rgbHalf_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("half"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("half"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaHalf_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("half"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yHalf_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("half"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *yHalf_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("half"),
+    babl_component ("Y'"),
+    NULL);
+
+/* Each converter serves both the linear and the gamma-encoded format pair. */
+#define CONV(src, dst) \
+do { \
+  babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+  babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+} while (0)
+
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE4_1) &&
+      (babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_F16C))
+    {
+      CONV(rgbaHalf, rgbaF);
+      CONV(rgbHalf, rgbF);
+      CONV(yaHalf, yaF);
+      CONV(yHalf, yF);
+      CONV(rgbaF, rgbaHalf);
+      CONV(rgbF, rgbHalf);
+      CONV(yaF, yaHalf);
+      CONV(yF, yHalf);
+    }
+
+#endif /* defined(USE_SSE4_1) && defined(USE_F16C) */
+
+  return 0;
+}
+
-- 
2.30.2